Figure 02a - Panel C/D/E

1) Setup

- Defining work directory

In this section, we define the working directory for the R Markdown document.

# Get the path of the current script
# Then get the parent directory of the parent directory of the parent directory
local_wd_folder <- dirname(dirname(dirname(rstudioapi::getSourceEditorContext()$path)))

# Set the root directory for knitr to the local working directory
knitr::opts_knit$set(root.dir = local_wd_folder)

- Defining input data and output directories

Here, we define the directories for input data and output files.

# Get the directory of the current script
script_folder <- dirname(rstudioapi::getSourceEditorContext()$path)

# Define the data folder and output folder
data_folder <- './Data'
#results_folder <- './Results'
figures_folder <- './Figures'

- Setting seed

Setting a seed ensures that any random processes are reproducible.

# Set a seed for reproducibility
set.seed(123)

- Packages installation (optional)

In this section, we ensure that all necessary packages are installed.

# Ensure BiocManager is available for installation of Bioconductor packages
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
BiocManager::install(version = "3.18")
## Bioconductor version 3.18 (BiocManager 1.30.22), R 4.3.3 (2024-02-29)
## Old packages: 'knitr', 'uwot'
# Define a list of required packages used in this script
packages_required <- c("ComplexHeatmap", "stringr", 
                        "unikn", "RColorBrewer", "yarrr", 
                        "scales", "ggsci", "R.utils", "GenomicFeatures", 
                       "tximport", "DESeq2", "tidyverse", "plotly", "dplyr", 
                       "ashr", "apeglm", "EnhancedVolcano", "IHW", "circlize", "Matrix")

# Identify any required packages that are not installed
packages_uninstalled <- packages_required[!(packages_required %in% installed.packages()[,"Package"])]

# Install any uninstalled packages
if(length(packages_uninstalled)) BiocManager::install(packages_uninstalled)

- Loading packages

Here, we load the necessary packages for our analysis.

# Load stringr for string manipulation
library(stringr, quietly = TRUE)
library(reticulate)

# Load ComplexHeatmap for creating complex heatmaps
library(ComplexHeatmap, quietly = TRUE)
library(EnhancedVolcano)

library(GenomicFeatures)
library(circlize)
library(tidyverse)

library(R.utils)
library(tximport)
library(DESeq2)
library(dplyr)
library(IHW)
library(ashr)
library(apeglm)
library(plotly)
# To create a conda environment first install either miniconda3 or anaconda3 on your OS
# conda create --name BPCMML python-kaleido plotly -c conda-forge
reticulate::use_condaenv("BPCMML")

- Loading palettes

In this section, we load additional color palettes and define some custom ones.

# Load additional colour palette packages
library(unikn, quietly = TRUE)
library(RColorBrewer, quietly = TRUE)
library(yarrr, quietly = TRUE)
library(scales, quietly = TRUE)
library(ggsci, quietly = TRUE)

# Define a set of custom color palettes from the unikn package
mix_1 <- usecol(pal = c(Karpfenblau, "white", Peach), n = 15)
mix_2 <- usecol(pal = c(rev(pal_seeblau), "white", pal_pinky))
mix_3 <- usecol(pal = c(rev(pal_bordeaux), "white", pal_petrol), n = 15)

# Display the custom palettes
seecol(list(mix_1, mix_2, mix_3), col_brd = "white", lwd_brd = 4, title = "Comparing palettes mixed from unikn colors", pal_names = c("mix_1", "mix_2", "mix_3"))

# Define a second set of custom palettes from the RColorBrewer and yarrr packages
brew_mix <- usecol(c(rev(brewer.pal(n = 4, name = "Reds")), "white", brewer.pal(n = 4, name = "Blues")), n = 13)
brew_ext <- usecol(brewer.pal(n = 11, name = "Spectral"), n = 12)
yarrr_mix <- usecol(c(piratepal("nemo"), piratepal("bugs")))
yarrr_mod <- usecol(c(piratepal("ipod")), n = 9)

# Display the second set of custom palettes
seecol(pal = list(brew_mix, brew_ext, yarrr_mix, yarrr_mod), col_brd = "white", lwd_brd = 2, title = "Using usecol() and seecol() to mix and modify palettes", pal_names = c("brew_mix", "brew_ext", "yarrr_mix", "yarrr_mod"))

# Define additional custom palettes from the scales package
natjournals_palette <- pal_npg("nrc")(10)

- Log Session Info

Finally, we log the session information for reproducibility.

# Write the session information to a text file
writeLines(capture.output(sessionInfo()), file.path(script_folder, 'Panel_A_SessionInfo.txt'))

# Print the session information
sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.3.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/London
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggsci_3.0.3                 scales_1.3.0               
##  [3] yarrr_0.1.5                 BayesFactor_0.9.12-4.7     
##  [5] Matrix_1.6-5                coda_0.19-4.1              
##  [7] jpeg_0.1-10                 RColorBrewer_1.1-3         
##  [9] unikn_1.0.0                 plotly_4.10.4              
## [11] apeglm_1.24.0               ashr_2.2-63                
## [13] IHW_1.30.0                  DESeq2_1.42.1              
## [15] SummarizedExperiment_1.32.0 MatrixGenerics_1.14.0      
## [17] matrixStats_1.3.0           tximport_1.30.0            
## [19] R.utils_2.12.3              R.oo_1.26.0                
## [21] R.methodsS3_1.8.2           lubridate_1.9.3            
## [23] forcats_1.0.0               dplyr_1.1.4                
## [25] purrr_1.0.2                 readr_2.1.5                
## [27] tidyr_1.3.1                 tibble_3.2.1               
## [29] tidyverse_2.0.0             circlize_0.4.16            
## [31] GenomicFeatures_1.54.4      AnnotationDbi_1.64.1       
## [33] Biobase_2.62.0              GenomicRanges_1.54.1       
## [35] GenomeInfoDb_1.38.8         IRanges_2.36.0             
## [37] S4Vectors_0.40.2            BiocGenerics_0.48.1        
## [39] EnhancedVolcano_1.20.0      ggrepel_0.9.5              
## [41] ggplot2_3.5.1               ComplexHeatmap_2.18.0      
## [43] reticulate_1.36.1           stringr_1.5.1              
## 
## loaded via a namespace (and not attached):
##   [1] rstudioapi_0.16.0        jsonlite_1.8.8           shape_1.4.6.1           
##   [4] magrittr_2.0.3           rmarkdown_2.26           GlobalOptions_0.1.2     
##   [7] BiocIO_1.12.0            zlibbioc_1.48.2          vctrs_0.6.5             
##  [10] memoise_2.0.1            Rsamtools_2.18.0         RCurl_1.98-1.14         
##  [13] SQUAREM_2021.1           mixsqp_0.3-54            htmltools_0.5.8.1       
##  [16] S4Arrays_1.2.1           progress_1.2.3           curl_5.2.1              
##  [19] truncnorm_1.0-9          SparseArray_1.2.4        sass_0.4.9              
##  [22] bslib_0.7.0              htmlwidgets_1.6.4        plyr_1.8.9              
##  [25] cachem_1.0.8             GenomicAlignments_1.38.2 lifecycle_1.0.4         
##  [28] iterators_1.0.14         pkgconfig_2.0.3          R6_2.5.1                
##  [31] fastmap_1.1.1            GenomeInfoDbData_1.2.11  clue_0.3-65             
##  [34] numDeriv_2016.8-1.1      digest_0.6.35            fdrtool_1.2.17          
##  [37] colorspace_2.1-0         irlba_2.3.5.1            lpsymphony_1.30.0       
##  [40] RSQLite_2.3.6            invgamma_1.1             filelock_1.0.3          
##  [43] fansi_1.0.6              timechange_0.3.0         httr_1.4.7              
##  [46] abind_1.4-5              compiler_4.3.3           bit64_4.0.5             
##  [49] withr_3.0.0              doParallel_1.0.17        BiocParallel_1.36.0     
##  [52] DBI_1.2.2                highr_0.10               biomaRt_2.58.2          
##  [55] MASS_7.3-60.0.1          rappdirs_0.3.3           DelayedArray_0.28.0     
##  [58] rjson_0.2.21             tools_4.3.3              glue_1.7.0              
##  [61] restfulr_0.0.15          cluster_2.1.6            generics_0.1.3          
##  [64] gtable_0.3.5             tzdb_0.4.0               rmdformats_1.0.4        
##  [67] data.table_1.15.4        hms_1.1.3                xml2_1.3.6              
##  [70] utf8_1.2.4               XVector_0.42.0           foreach_1.5.2           
##  [73] pillar_1.9.0             emdbook_1.3.13           BiocFileCache_2.10.2    
##  [76] lattice_0.22-6           rtracklayer_1.62.0       bit_4.0.5               
##  [79] tidyselect_1.2.1         locfit_1.5-9.9           pbapply_1.7-2           
##  [82] Biostrings_2.70.3        knitr_1.45               bookdown_0.39.1         
##  [85] xfun_0.43                stringi_1.8.3            lazyeval_0.2.2          
##  [88] yaml_2.3.8               evaluate_0.23            codetools_0.2-20        
##  [91] bbmle_1.0.25.1           BiocManager_1.30.22      cli_3.6.2               
##  [94] munsell_0.5.1            jquerylib_0.1.4          Rcpp_1.0.12             
##  [97] dbplyr_2.5.0             png_0.1-8                bdsmatrix_1.3-7         
## [100] XML_3.99-0.16.1          parallel_4.3.3           MatrixModels_0.5-3      
## [103] blob_1.2.4               prettyunits_1.2.0        bitops_1.0-7            
## [106] viridisLite_0.4.2        mvtnorm_1.2-4            slam_0.1-50             
## [109] crayon_1.5.2             GetoptLong_1.0.5         rlang_1.1.3             
## [112] KEGGREST_1.42.0

2) Loading input files

- Loading gene annotation files

We use the following code to download a gene annotation file, convert it into a SQLite database for genomic analysis, or load the existing database if it’s already present.

- Load check or creation of sqlite file

# Define the path to the SQLite database
txdb.filename <- paste(data_folder, "/RNASeq/Annotation_files/GENCODE_v42.annotation.sqlite", sep = '')

# Check if the SQLite database already exists
if (file.exists(txdb.filename)) {
  
  # Print a message indicating that the database already exists and is being loaded
  print(sprintf('File %s already exist! Now Loading...',txdb.filename))
  
  # Load the SQLite database
  txdb <- loadDb(txdb.filename)
  
} else {

  # Define the URL of the GTF file to download
  url <- "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_42/gencode.v42.annotation.gtf.gz"
  
  dir.create(paste(data_folder, "/RNASeq/Annotation_files", sep = ''), 
             showWarnings = F)
  
  # Define the local path where the GTF file will be saved
  destfile <- paste(data_folder, "/RNASeq/Annotation_files/GENCODE_v42.annotation.gtf.gz", sep = '')
  
  # Download the GTF file
  download.file(url, destfile)
  
  # Define the path of the decompressed GTF file
  gtf <- paste(data_folder, "/RNASeq/Annotation_files/GENCODE_v42.annotation.gtf", sep = '')
  
  # If the decompressed GTF file already exists, remove it
  if (file.exists(gtf)) {
    file.remove(gtf)
  }
  
  # Decompress the GTF file
  gunzip(destfile)

  # Create a TxDb object from the GTF file
  txdb <- makeTxDbFromGFF(gtf)
  
  # Save the TxDb object as a SQLite database for later use
  saveDb(txdb, txdb.filename)
  
  # Load the SQLite database
  txdb <- loadDb(txdb.filename)

  # Print a message indicating that the SQLite database has been generated
  print(sprintf('File %s has been generated!',txdb.filename))

}
## [1] "File ./Data/RNASeq/Annotation_files/GENCODE_v42.annotation.sqlite already exist! Now Loading..."

- Load check or creation of csv file

We use the following code to load gene metadata from a CSV file, or if it’s not present, to extract it from a GTF file and create the CSV.

# Define the path to the gene metadata CSV file
gencode_gene_info_df.filename <- paste(data_folder, "/RNASeq/Annotation_files/GENCODE_v42.gene_metadata.csv", sep = '')

# Check if the CSV file already exists
if (file.exists(gencode_gene_info_df.filename)) {
  
  # Print a message indicating that the CSV file already exists
  print(sprintf('File %s already exist! Now Loading...',gencode_gene_info_df.filename))
  
  # Load the CSV file into a dataframe
  Genes_annotation_metadata <- read.csv(paste(data_folder, '/RNASeq/Annotation_files/GENCODE_v42.gene_metadata.csv', sep = ''), header = TRUE)
  
  # Set the row names of the dataframe to the gene IDs
  rownames(Genes_annotation_metadata) <- Genes_annotation_metadata$gene_id
  
} else {
  
  # Define the path to the GTF file and the SQLite database
  gtf <- paste(data_folder, "/RNASeq/Annotation_files/GENCODE_v42.annotation.gtf", sep = '')
  txdb.filename <- paste(data_folder, "/RNASeq/Annotation_files/GENCODE_v42.annotation.sqlite", sep = '')
  
  # Load the SQLite database
  txdb <- loadDb(txdb.filename)
  
  # Load the GTF file into a dataframe
  gtf_df <- read.table(file = gtf, sep="\t", header=FALSE)
  
  # Filter the dataframe to include only rows where the third column is 'gene'
  gtf_df_genes <- gtf_df %>% dplyr::filter(V3 == 'gene')
  
  # Create an empty dataframe to store the gene metadata
  gencode_gene_info_df <- data.frame()
  
  # Create a progress bar
  pb <- txtProgressBar(min = 0, max = nrow(gtf_df_genes), style = 3)
  
  # Iterate over the rows of the filtered dataframe
  for (gene_index in 1:nrow(gtf_df_genes)) {
    
    # Update the progress bar
    setTxtProgressBar(pb, gene_index)
    
    # Extract the gene information from the ninth column of the current row
    gene_info <- gsub(";\\s+", ";", gtf_df_genes[gene_index,'V9'])
    
    # Split the gene information into items
    items <- strsplit(gene_info, ";")[[1]]
    
    # Iterate over the items and split each one on the space
    for (item in items) {
      parts <- unlist(strsplit(item, " "))
      
      # If there are exactly two parts, add them to the dataframe as a new column
      if (length(parts) == 2) {
        gencode_gene_info_df[unlist(strsplit(items[[1]], " "))[[2]],sprintf('%s',parts[1])] <- parts[2]
      }
    }
  }
  
  # Close the progress bar
  close(pb)
  
  # Write the dataframe to a CSV file
  write.csv(x = gencode_gene_info_df, file = gencode_gene_info_df.filename, row.names = F)
  
  # Print a message indicating that the CSV file has been generated
  print(sprintf('File %s has been generated! Now loading...',gencode_gene_info_df.filename))
  
  # Load the CSV file into a dataframe
  Genes_annotation_metadata <- read.csv(paste(data_folder, '/RNASeq/Annotation_files/GENCODE_v42.gene_metadata.csv', sep = ''), header = TRUE)
  
  # Set the row names of the dataframe to the gene IDs
  rownames(Genes_annotation_metadata) <- Genes_annotation_metadata$gene_id
}
## [1] "File ./Data/RNASeq/Annotation_files/GENCODE_v42.gene_metadata.csv already exist! Now Loading..."

- Loading sample metadata files

Samples_metadata <- read.csv(paste(data_folder, '/RNASeq/Metadata/RNASeq_Samples_Metadata.csv', sep = ''), row.names = 'Patient.ID')
Samples_metadata$Condition <- factor(Samples_metadata$Condition)
Samples_metadata$Condition <- relevel(Samples_metadata$Condition, ref = "Control")

3) [OPTIONAL]: Importing and grouping raw counts

Only relevant if we start from the raw counts

- Loading counts path

- Gene-level summarization via tximport

- Exporting gene-level counts and TPMs

4) DESeq2 Analysis

- Loading read and TPM matrices

dir.create(paste(data_folder, '/RNASeq/DESeq2', sep = ''), showWarnings = F)

TXI.Genes <- readRDS(paste(data_folder, '/RNASeq/Counts/Tximport_object.rds', sep = ''))

- Filtering non-unique uninformative Ensembl.id

library(DESeq2)
dds = DESeqDataSetFromTximport(txi = TXI.Genes, 
                               colData = Samples_metadata,
                               rowData= Genes_annotation_metadata[rownames(TXI.Genes$counts),], 
                               design = ~ Condition)
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## using counts and average transcript lengths from tximport

The following filtering step removes many non-informative/duplicated genes

dds <- estimateSizeFactors(dds)
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## using 'avgTxLength' from assays(dds), correcting for library size
keep <- rowSums(counts(dds, normalized=TRUE) >= 5 ) >= 3
dds <- dds[keep,]
dds <- DESeq(dds)
## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 3085 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

Few non-unique genes will remain. We filter out duplicates with low baseMean.

table(isUnique(rowData(dds)$gene_name))
## 
## FALSE  TRUE 
##   326 26777
rowData(dds)[grep(FALSE,isUnique(rowData(dds)$gene_name)),]
## DataFrame with 326 rows and 31 columns
##                                         gene_id              gene_type
##                                     <character>            <character>
## ENSG00000002586.20           ENSG00000002586.20         protein_coding
## ENSG00000002586.20_PAR_Y ENSG00000002586.20_P..         protein_coding
## ENSG00000015479.20           ENSG00000015479.20         protein_coding
## ENSG00000067601.11           ENSG00000067601.11 transcribed_unproces..
## ENSG00000080947.16           ENSG00000080947.16 transcribed_unproces..
## ...                                         ...                    ...
## ENSG00000291192.1             ENSG00000291192.1                 lncRNA
## ENSG00000291223.1             ENSG00000291223.1                 lncRNA
## ENSG00000291263.1             ENSG00000291263.1                 lncRNA
## ENSG00000291266.1             ENSG00000291266.1                 lncRNA
## ENSG00000291280.1             ENSG00000291280.1                 lncRNA
##                            gene_name     level                 tag     hgnc_id
##                          <character> <integer>         <character> <character>
## ENSG00000002586.20              CD99         2                  NA   HGNC:7082
## ENSG00000002586.20_PAR_Y        CD99         2                 PAR   HGNC:7082
## ENSG00000015479.20             MATR3         1   overlapping_locus   HGNC:6912
## ENSG00000067601.11            PMS2P4         2                  NA   HGNC:9129
## ENSG00000080947.16           CROCCP3         2                  NA  HGNC:29405
## ...                              ...       ...                 ...         ...
## ENSG00000291192.1          ANKRD18DP         2 overlaps_pseudogene  HGNC:28016
## ENSG00000291223.1            FAM86DP         2 overlaps_pseudogene  HGNC:32659
## ENSG00000291263.1             SMG1P7         2 overlaps_pseudogene          NA
## ENSG00000291266.1             SMG1P5         2 overlaps_pseudogene          NA
## ENSG00000291280.1        ANKRD20A11P         2 overlaps_pseudogene          NA
##                                    havana_gene  artif_dupl   baseMean
##                                    <character> <character>  <numeric>
## ENSG00000002586.20       OTTHUMG00000021073.12          NA 4511.64036
## ENSG00000002586.20_PAR_Y OTTHUMG00000021073.12          NA 4528.92115
## ENSG00000015479.20        OTTHUMG00000129229.7          NA 5829.26679
## ENSG00000067601.11        OTTHUMG00000156923.3          NA    1.06017
## ENSG00000080947.16        OTTHUMG00000037885.4          NA   30.82237
## ...                                        ...         ...        ...
## ENSG00000291192.1                           NA          NA     4.9106
## ENSG00000291223.1                           NA          NA   163.7271
## ENSG00000291263.1                           NA          NA    97.4232
## ENSG00000291266.1                           NA          NA   333.7816
## ENSG00000291280.1                           NA          NA    10.6116
##                              baseVar   allZero dispGeneEst dispGeneIter
##                            <numeric> <logical>   <numeric>    <numeric>
## ENSG00000002586.20       10555974.30     FALSE   0.4225708            9
## ENSG00000002586.20_PAR_Y 11701925.40     FALSE   0.4540431           10
## ENSG00000015479.20        2211028.62     FALSE   0.0648926           13
## ENSG00000067601.11             16.77     FALSE  49.0000000            9
## ENSG00000080947.16           1561.80     FALSE   4.7090224            9
## ...                              ...       ...         ...          ...
## ENSG00000291192.1            111.105     FALSE   12.181512           13
## ENSG00000291223.1           8557.502     FALSE    0.412742           14
## ENSG00000291263.1           4090.590     FALSE    0.471501           14
## ENSG00000291266.1          32591.028     FALSE    0.447433           12
## ENSG00000291280.1            396.229     FALSE    6.365591           14
##                            dispFit dispersion  dispIter dispOutlier   dispMAP
##                          <numeric>  <numeric> <integer>   <logical> <numeric>
## ENSG00000002586.20        0.366169   0.426228        10       FALSE  0.426228
## ENSG00000002586.20_PAR_Y  0.366102   0.466530        11       FALSE  0.466530
## ENSG00000015479.20        0.362155   0.075931        14       FALSE  0.075931
## ENSG00000067601.11       75.924728  49.000000        10       FALSE 49.000000
## ENSG00000080947.16        2.947947   4.532790        11       FALSE  4.532790
## ...                            ...        ...       ...         ...       ...
## ENSG00000291192.1        16.664925  12.779288        10       FALSE 12.779288
## ENSG00000291223.1         0.837785   0.432863        13       FALSE  0.432863
## ENSG00000291263.1         1.170841   0.500636        10       FALSE  0.500636
## ENSG00000291266.1         0.588459   0.455304        11       FALSE  0.455304
## ENSG00000291280.1         7.898978   6.494157         8       FALSE  6.494157
##                          Intercept Condition_BP.CMML_vs_Control SE_Intercept
##                          <numeric>                    <numeric>    <numeric>
## ENSG00000002586.20       11.099672                     1.158301     0.356145
## ENSG00000002586.20_PAR_Y 11.034931                     1.233652     0.372600
## ENSG00000015479.20       12.605102                    -0.112644     0.150382
## ENSG00000067601.11       -0.708441                     0.891925     3.867702
## ENSG00000080947.16        5.177179                    -0.272903     1.163871
## ...                            ...                          ...          ...
## ENSG00000291192.1          2.92352                    -0.781207     1.963053
## ENSG00000291223.1          7.60633                    -0.303185     0.360405
## ENSG00000291263.1          6.50084                     0.117172     0.388991
## ENSG00000291266.1          7.97354                     0.467345     0.369223
## ENSG00000291280.1          3.90858                    -0.626191     1.397052
##                          SE_Condition_BP.CMML_vs_Control
##                                                <numeric>
## ENSG00000002586.20                              0.384673
## ENSG00000002586.20_PAR_Y                        0.402444
## ENSG00000015479.20                              0.162441
## ENSG00000067601.11                              4.175538
## ENSG00000080947.16                              1.257398
## ...                                                  ...
## ENSG00000291192.1                               2.121148
## ENSG00000291223.1                               0.389521
## ENSG00000291263.1                               0.420375
## ENSG00000291266.1                               0.398806
## ENSG00000291280.1                               1.509887
##                          WaldStatistic_Intercept
##                                        <numeric>
## ENSG00000002586.20                     31.166165
## ENSG00000002586.20_PAR_Y               29.616032
## ENSG00000015479.20                     83.820712
## ENSG00000067601.11                     -0.183168
## ENSG00000080947.16                      4.448242
## ...                                          ...
## ENSG00000291192.1                        1.48927
## ENSG00000291223.1                       21.10493
## ENSG00000291263.1                       16.71203
## ENSG00000291266.1                       21.59545
## ENSG00000291280.1                        2.79773
##                          WaldStatistic_Condition_BP.CMML_vs_Control
##                                                           <numeric>
## ENSG00000002586.20                                         3.011136
## ENSG00000002586.20_PAR_Y                                   3.065397
## ENSG00000015479.20                                        -0.693446
## ENSG00000067601.11                                         0.213607
## ENSG00000080947.16                                        -0.217038
## ...                                                             ...
## ENSG00000291192.1                                         -0.368294
## ENSG00000291223.1                                         -0.778353
## ENSG00000291263.1                                          0.278731
## ENSG00000291266.1                                          1.171860
## ENSG00000291280.1                                         -0.414727
##                          WaldPvalue_Intercept
##                                     <numeric>
## ENSG00000002586.20               3.06352e-213
## ENSG00000002586.20_PAR_Y         9.29058e-193
## ENSG00000015479.20                0.00000e+00
## ENSG00000067601.11                8.54666e-01
## ENSG00000080947.16                8.65758e-06
## ...                                       ...
## ENSG00000291192.1                 1.36416e-01
## ENSG00000291223.1                 7.16562e-99
## ENSG00000291263.1                 1.07120e-62
## ENSG00000291266.1                1.98215e-103
## ENSG00000291280.1                 5.14625e-03
##                          WaldPvalue_Condition_BP.CMML_vs_Control  betaConv
##                                                        <numeric> <logical>
## ENSG00000002586.20                                    0.00260272      TRUE
## ENSG00000002586.20_PAR_Y                              0.00217381      TRUE
## ENSG00000015479.20                                    0.48802941      TRUE
## ENSG00000067601.11                                    0.83085340      TRUE
## ENSG00000080947.16                                    0.82817887      TRUE
## ...                                                          ...       ...
## ENSG00000291192.1                                       0.712654      TRUE
## ENSG00000291223.1                                       0.436361      TRUE
## ENSG00000291263.1                                       0.780451      TRUE
## ENSG00000291266.1                                       0.241253      TRUE
## ENSG00000291280.1                                       0.678342      TRUE
##                           betaIter  deviance  maxCooks   replace
##                          <numeric> <numeric> <logical> <logical>
## ENSG00000002586.20               4  902.3928        NA     FALSE
## ENSG00000002586.20_PAR_Y         4  905.4087        NA     FALSE
## ENSG00000015479.20               3  853.4472        NA     FALSE
## ENSG00000067601.11               5   59.5232        NA     FALSE
## ENSG00000080947.16              14  378.9929        NA     FALSE
## ...                            ...       ...       ...       ...
## ENSG00000291192.1               11   175.734        NA     FALSE
## ENSG00000291223.1                4   581.103        NA     FALSE
## ENSG00000291263.1                4   534.563        NA     FALSE
## ENSG00000291266.1                4   652.509        NA     FALSE
## ENSG00000291280.1               10   255.859        NA     FALSE
df <- as.data.frame(rowData(dds)[grep(FALSE,isUnique(rowData(dds)$gene_name)),])

# Maximum absolute value of Expression by Gene
maxabs <- with(df, aggregate(baseMean, list(gene_name=gene_name), FUN=function(x) max(abs(x))))

# Combine with original data frame
df <- merge(df, maxabs, by="gene_name")

# Get desired rows
df_unique <- subset(df, abs(baseMean) == x)

# Remove duplicate genes (all gene rows should be identical)
df_unique <- df_unique[!duplicated(df_unique$gene_name), ]

Now we generate a list of unique genes for the real DESeq2 run

unique_genes <- c(rowData(dds)[grep(TRUE,isUnique(rowData(dds)$gene_name)),]$gene_id,
                  df_unique$gene_id)

write.csv(x = data.frame(unique_genes), file = paste(data_folder, '/RNASeq/Counts/List_unique_genes.csv', sep = ''), quote = F, row.names = F)

table(isUnique(unique_genes))
## 
##  TRUE 
## 26928

Number and proportion of duplicated discarded genes

print(table(isUnique(unique_genes))-table(isUnique(rowData(dds)$gene_name))[[2]])
## 
## TRUE 
##  151
print((table(isUnique(unique_genes))-table(isUnique(rowData(dds)$gene_name))[[2]])/table(isUnique(unique_genes)))
## 
##        TRUE 
## 0.005607546

- Running DESeq2 with unique genes

- Testing parametric dispersion vs local

dds_parametric = DESeqDataSetFromTximport(txi = TXI.Genes, 
                               colData = Samples_metadata,
                               rowData = Genes_annotation_metadata[rownames(TXI.Genes$counts),], 
                               design = ~ Condition)
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## using counts and average transcript lengths from tximport
dds_parametric <- estimateSizeFactors(dds_parametric)
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## using 'avgTxLength' from assays(dds), correcting for library size
dds_parametric <- dds_parametric[unique_genes,]
dds_parametric <- DESeq(dds_parametric, fitType = "parametric")
## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 3046 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
Residuals_parametric <- log(mcols(dds_parametric)$dispGeneEst)-log(mcols(dds_parametric)$dispFit)
plotDispEsts(dds_parametric, main= "dispEst: Parametric")

hist(x = Residuals_parametric, xlim = c(-20,10))

Median_Absolute_Residual_parametric <- median(abs(log(mcols(dds_parametric)$dispGeneEst)-log(mcols(dds_parametric)$dispFit)))

dds_parametric$Condition <- relevel(dds_parametric$Condition, ref = "Control")
res_parametric <- results(dds_parametric, contrast=c("Condition", "BP-CMML", "Control"))

dds_local = DESeqDataSetFromTximport(txi = TXI.Genes, 
                               colData = Samples_metadata,
                               rowData= Genes_annotation_metadata[rownames(TXI.Genes$counts),], 
                               design = ~ Condition)
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## using counts and average transcript lengths from tximport
dds_local <- estimateSizeFactors(dds_local)
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## using 'avgTxLength' from assays(dds), correcting for library size
dds_local <- dds_local[unique_genes,]
dds_local <- DESeq(dds_local, fitType = "local")
## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 3046 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
Residuals_local <- log(mcols(dds_local)$dispGeneEst)-log(mcols(dds_local)$dispFit)
plotDispEsts(dds_local, main= "dispEst: Local")

hist(x = Residuals_local, xlim = c(-20,10))

Median_Absolute_Residual_local <- median(abs(log(mcols(dds_local)$dispGeneEst)-log(mcols(dds_local)$dispFit)))

dds_local$Condition <- relevel(dds_local$Condition, ref = "Control")
res_local <- results(dds_local, contrast=c("Condition", "BP-CMML", "Control"))

if ((Median_Absolute_Residual_parametric < Median_Absolute_Residual_local) == TRUE) {
  Best_scoring_fitType <- 'parametric'
} else {
  Best_scoring_fitType <- 'local'
}
dds = DESeqDataSetFromTximport(txi = TXI.Genes, 
                               colData = Samples_metadata,
                               rowData= Genes_annotation_metadata[rownames(TXI.Genes$counts),], 
                               design = ~ Condition)
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## using counts and average transcript lengths from tximport
dds <- estimateSizeFactors(dds)
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## using 'avgTxLength' from assays(dds), correcting for library size
dds <- dds[unique_genes,]
dds <- DESeq(dds, fitType = Best_scoring_fitType)
## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 3046 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

- Generating VST and RLOG normalised counts

# Generating vst normalised counts

vst <- vst(dds, blind=FALSE)
head(assay(vst), 3)
##                           55      136      171       246       279       293
## ENSG00000000003.15  8.065115 7.654697 7.414760  7.414760  7.414760  7.414760
## ENSG00000000419.14 10.210298 9.617510 9.904598 10.577354 10.088775 10.913115
## ENSG00000000457.14  9.139483 9.401286 9.711656  8.671656  9.198039  8.560446
##                          303       316       343      416      495       543
## ENSG00000000003.15  7.665355  7.414760  7.701553 7.985013 7.654199  7.799596
## ENSG00000000419.14 10.130760 10.127476 10.263280 9.980862 9.962822 10.406475
## ENSG00000000457.14  9.276793  9.020454  9.631191 9.566639 8.935847  9.143235
##                         544       620       660      671      689      697
## ENSG00000000003.15 7.414760  7.414760  7.414760 7.983726 7.414760 7.998331
## ENSG00000000419.14 9.926808 10.058707 10.012602 9.849691 9.827424 9.737310
## ENSG00000000457.14 9.113867  8.993742  9.088332 8.984527 9.670583 9.043966
##                          713       745      756      776      783       812
## ENSG00000000003.15  7.414760  7.414760 8.464892 7.612095 7.414760  7.414760
## ENSG00000000419.14 10.568712 10.224058 9.727712 9.502122 9.807414 10.136797
## ENSG00000000457.14  8.833221  9.023978 9.099994 9.174390 8.947940  9.535468
##                          821      822      823       837      872       888
## ENSG00000000003.15  8.215109 7.414760 7.414760  7.414760 7.908169  7.668771
## ENSG00000000419.14 10.185967 9.592477 9.634643 10.291304 9.419343 10.278589
## ENSG00000000457.14  9.316280 9.002598 9.363783  9.175015 8.945857  9.673822
##                          919      930      963      965       993     1007
## ENSG00000000003.15  7.414760 7.414760 7.414760 8.055938  8.070688 7.789379
## ENSG00000000419.14 10.645181 9.939289 9.856903 9.649153 10.044526 9.763629
## ENSG00000000457.14  8.463859 8.987876 9.055457 9.478553  9.529790 9.742334
##                        1049      1061     1068      1122     1159     1217
## ENSG00000000003.15 7.767139  7.906386 7.414760  7.414760 7.414760 8.602357
## ENSG00000000419.14 9.921815 10.334517 9.962073 10.385162 9.982011 9.906338
## ENSG00000000457.14 9.441179  9.372741 9.367584  8.958028 9.390808 9.396863
##                         929     1126     1138     1157      1162      1170
## ENSG00000000003.15 8.162077 8.245891 8.523467 7.736537  7.897546  8.083725
## ENSG00000000419.14 9.951198 9.938706 9.817489 9.804952 10.014576 10.257753
## ENSG00000000457.14 9.042888 9.313433 9.171286 9.030463  9.278497  9.266932
##                         1207
## ENSG00000000003.15  7.750065
## ENSG00000000419.14 10.022456
## ENSG00000000457.14  9.046908
vst_mat <- as.data.frame(vst@assays@data@listData[[1]]) %>% add_column(GeneSymbol = as.data.frame(rowData(vst))[,'gene_name'], .before = colnames(dds)[1]) %>% add_column(GeneBiotype = as.data.frame(rowData(vst))[,'gene_type'], .before = colnames(dds)[1]) 

write.csv(vst_mat, file = paste(data_folder, '/RNASeq/Counts/Processed_counts/Grouped/DGE_DESeq2_vst_normalised_countmat.csv', sep = ''), quote = F, row.names = T)

# Generating rlog normalised counts

rlog <- rlogTransformation(dds, blind=FALSE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
head(assay(rlog), 3)
##                          55      136      171       246      279       293
## ENSG00000000003.15 3.950361 3.368203 3.221205  3.225189 3.221423  3.229764
## ENSG00000000419.14 9.710953 9.028083 9.369426 10.098703 9.577577 10.436407
## ENSG00000000457.14 8.122866 8.453487 8.815104  7.448380 8.199202  7.275077
##                         303      316      343      416      495      543
## ENSG00000000003.15 3.382674 3.224439 3.424512 3.833022 3.369032 3.556126
## ENSG00000000419.14 9.623872 9.620333 9.768216 9.456551 9.436106 9.920738
## ENSG00000000457.14 8.299244 7.962929 8.723668 8.649793 7.845588 8.127792
##                         544      620      660      671      689      697
## ENSG00000000003.15 3.223315 3.223584 3.230236 3.832398 3.224449 3.855451
## ENSG00000000419.14 9.395008 9.544094 9.492415 9.305826 9.279837 9.173208
## ENSG00000000457.14 8.089030 7.926600 8.055185 7.912947 8.768610 7.994768
##                          713      745      756      776      783      812
## ENSG00000000003.15  3.223922 3.228140 4.542076 3.325656 3.225624 3.224430
## ENSG00000000419.14 10.089914 9.725823 9.161917 8.885234 9.256485 9.630602
## ENSG00000000457.14  7.696005 7.968017 8.070511 8.168570 7.861710 8.613107
##                         821      822      823      837      872      888
## ENSG00000000003.15 4.183403 3.225191 3.226805 3.229318 3.717059 3.384739
## ENSG00000000419.14 9.684511 8.997572 9.049730 9.798151 8.779474 9.784720
## ENSG00000000457.14 8.349083 7.938325 8.407593 8.169371 7.858635 8.772361
##                          919      930      963      965      993     1007
## ENSG00000000003.15  3.225794 3.221746 3.225663 3.942258 3.958647 3.541928
## ENSG00000000419.14 10.168205 9.409230 9.314316 9.066893 9.528249 9.204553
## ENSG00000000457.14  7.105261 7.917604 8.010824 8.546149 8.606393 8.849341
##                        1049     1061     1068     1122     1159     1217
## ENSG00000000003.15 3.511106 3.710542 3.221791 3.225425 3.219552 4.727998
## ENSG00000000419.14 9.389246 9.844532 9.435209 9.898169 9.457843 9.371437
## ENSG00000000457.14 8.501459 8.418749 8.412416 7.876535 8.440902 8.448128
##                         929     1126     1138     1157     1162     1170
## ENSG00000000003.15 4.104050 4.228812 4.620055 3.468986 3.701202 3.985704
## ENSG00000000419.14 9.422825 9.408579 9.268253 9.253361 9.494635 9.762253
## ENSG00000000457.14 7.993334 8.345474 8.164499 7.976193 8.301795 8.287062
##                        1207
## ENSG00000000003.15 3.487348
## ENSG00000000419.14 9.503502
## ENSG00000000457.14 7.998717
rlog_mat <- as.data.frame(rlog@assays@data@listData[[1]]) %>% add_column(GeneSymbol = as.data.frame(rowData(rlog))[,'gene_name'], .before = colnames(dds)[1]) %>% add_column(GeneBiotype = as.data.frame(rowData(rlog))[,'gene_type'], .before = colnames(dds)[1]) 

write.csv(rlog_mat, file = paste(data_folder, '/RNASeq/Counts/Processed_counts/Grouped/DGE_DESeq2_rlog_normalised_countmat.csv', sep = ''), quote = F, row.names = T)

- Performing PCA analysis

rv <- rowVars(assay(vst))
# select the ntop genes by variance
select <- order(rv, decreasing=TRUE)[seq_len(min(500, length(rv)))]
# perform a PCA on the data in assay(x) for the selected genes
pca <- prcomp(t(assay(vst)[select,]))

pca_samples <- as.data.frame(pca$x)
pca_samples$Condition <- colData(dds)[rownames(pca_samples),'Condition']

pca_genes <- as.data.frame(pca$rotation)
pca_genes <- add_column(pca_genes, .before = 'PC1', GeneSymbol = rowData(dds)[rownames(pca_genes),'gene_name'])

dir.create(path = paste(data_folder, '/RNASeq/DESeq2/PCs', sep = ''), showWarnings = F)

write.csv(pca_samples, file = paste(data_folder, '/RNASeq/DESeq2/PCs/DGE_Samples_PCs_from_rlog_normalised_count_mat.csv', sep = ''), quote = F, row.names = T)

write.csv(pca_genes, file = paste(data_folder, '/RNASeq/DESeq2/PCs/DGE_Genes_PCs_from_rlog_normalised_count_mat.csv', sep = ''), quote = F, row.names = T)

var_explained_df <- data.frame(PC= paste0("PC",1:49),
                               var_explained=(pca$sdev)^2/sum((pca$sdev)^2))

write.csv(var_explained_df, file = paste(data_folder, '/RNASeq/DESeq2/PCs/DGE_PC_variance_explained.csv', sep = ''), quote = F, row.names = T)

tot_var_explained <- (var_explained_df$var_explained[1] + 
  var_explained_df$var_explained[2] +
  var_explained_df$var_explained[3]) * 100

- Plotting 3d PCs via plotly

Font_characteristics <- list(
  family = "Helvetica",
  size = 14,
  color = "black")

- By condition

With legend

fig <- plot_ly(pca_samples, 
               x = ~PC2, y = ~PC1, z = ~PC3, color = ~Condition, 
               colors = c('BP-CMML' = '#DD3429',
                          'Control' = '#A4BECE') ) %>% add_markers(size = 12, marker = list (size = 12))


fig <- fig %>% layout(
    title = '',
    showlegend = TRUE,
    legend = list(
        title = '', 
        font = Font_characteristics, 
        x = 0.46, # Move the legend more to the left
        y = 0.0, # Move the legend slightly up
        xanchor = 'center',
        yanchor = 'bottom'
    ),
    scene = list(
        bgcolor = "#ffffff",
        camera = list(
            eye = list(
                x = -1.75,
                y = 1.85,
                z = 1.04
            ),
            center = list(x = -0.425,
                          y = 0.35, # Move the figure up
                          z = 0)
        )
    )
)

# This requires loading python environment with kaleido installed

save_image(fig, paste(figures_folder, '/Not_Included/DGE_Condition_with_legend.png', sep = ''), width = 2.5 * 300, height = 2.5 * 300, scale = 2)

Without legend

fig <- plot_ly(pca_samples, 
               x = ~PC2, y = ~PC1, z = ~PC3, color = ~Condition, 
               colors = c('BP-CMML' = '#DD3429',
                          'Control' = '#A4BECE') ) %>% add_markers(size = 10, marker = list (size = 10))


fig <- fig %>% layout(
    title = '',
    showlegend = FALSE,
    legend = list(
        title = '', 
        font = Font_characteristics, 
        x = 0.45, # Move the legend more to the left
        y = -0.03, # Move the legend slightly up
        xanchor = 'center',
        yanchor = 'bottom'
    ),
    scene = list(
        bgcolor = "#ffffff",
        camera = list(
            eye = list(
                x = -1.75,
                y = 1.85,
                z = 1.04
            ),
            center = list(x = -0.425,
                          y = 0.35, # Move the figure up
                          z = 0)
        )
    )
)

save_image(fig, paste(figures_folder, '/Not_Included/DGE_Condition_without_legend.png', sep = ''), width = 2.5 * 300, height = 2.5 * 300, scale = 2)

- By ICC classification

With legend

## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Figure 2C. PCA plot showing samples’ coordinates for top three principal components ranked by variance explained and identified by PCA analysis of VST normalised count data. Samples are coloured by disease state/ICC classification as indicated

## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Without legend

pca_samples_ICC <- as.data.frame(pca$x)
pca_samples_ICC$ICC.classification <- colData(dds)[rownames(pca_samples),'ICC.classification']

fig <- plot_ly(pca_samples_ICC, 
               x = ~PC2, y = ~PC1, z = ~PC3, color = ~ICC.classification,
               colors = c('BPDCN' = '#268418',
                          'AML with MDS-related cytogenetic abnormalities' = '#B533B8',
                          'AML with MDS-related gene mutations' = '#FB84A2',
                          'AML with NPM1' = '#1754B8',
                          'AML with CEBPA' = '#FFCB7C',
                          'AML not otherwise specified' = '#8E3318',
                          'Control' = '#A4BECE'),
               marker = list(size = 12)) 


fig <- fig %>% layout(
    title = '',
    showlegend = FALSE,
    legend = list(
        title = '', 
        font = Font_characteristics, 
        x = 0.49, # Move the legend more to the left
        y = -0.03, # Move the legend slightly up
        xanchor = 'center',
        yanchor = 'bottom'
    ),
    scene = list(
        bgcolor = "#ffffff",
        camera = list(
            eye = list(
                x = -1.75,
                y = 1.85,
                z = 1.04
            ),
            center = list(x = -0.425,
                          y = 0.35, # Move the figure up
                          z = 0)
        )
    )
)

save_image(fig, paste(figures_folder, '/Not_Included/DGE_ICC_without_legend.png', sep = ''), width = 3 * 300, height = 3 * 300, scale = 2)
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

- By WHO classification

With legend

## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Figure S2. PCA plot showing samples’ coordinates for top three principal components ranked by variance explained and identified by PCA analysis of VST normalised count data. Samples are coloured by disease state/WHO classification as indicated

## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

With legend

pca_samples_WHO <- as.data.frame(pca$x)
pca_samples_WHO$WHO.classification <- colData(dds)[rownames(pca_samples),'WHO.classification']

fig <- plot_ly(pca_samples_WHO, 
               x = ~PC2, y = ~PC1, z = ~PC3, color = ~WHO.classification,
               colors = c('BPDCN' = '#268418',
                          'AML myelodysplasia-related' = '#FB84A2',
                          'AML with NPM1' = '#1754B8',
                          'AML with CEBPA' = '#FFCB7C',
                          'AML with other defined genetic alterations' = '#8E3318',
                          'Control' = '#A4BECE'),
               marker = list(size = 12)) 


fig <- fig %>% layout(
    title = '',
    showlegend = FALSE,
    legend = list(
        title = '', 
        font = Font_characteristics, 
        x = 0.46, # Move the legend more to the left
        y = 0.0, # Move the legend slightly up
        xanchor = 'center',
        yanchor = 'bottom'
    ),
    scene = list(
        bgcolor = "#ffffff",
        camera = list(
            eye = list(
                x = -1.75,
                y = 1.85,
                z = 1.04
            ),
            center = list(x = -0.425,
                          y = 0.35, # Move the figure up
                          z = 0)
        )
    )
)

save_image(fig, paste(figures_folder, '/Not_Included/DGE_WHO_with_legend.png', sep = ''), width = 2.5 * 300, height = 2.5 * 300, scale = 2)
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

- DEGs

TF_list <- read.csv(paste(data_folder, '/RNASeq/Resources/Gene_Metadata/Lambert2018_TFs.txt', sep = ''), sep = '\t', header = F)

HGNC_CD_markers_list <- read.csv(paste(data_folder, '/RNASeq/Resources/Gene_Metadata/HGNC_CD_Markers.csv', sep = ''))

- BP-CMML vs Control

- Evaluating number of significant genes

## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## Warning in nbinomGLM(x = x, Y = YNZ, size = size, weights = weightsNZ, offset =
## offsetNZ, : the line search routine failed, possibly due to insufficient
## numeric precision

## Warning in nbinomGLM(x = x, Y = YNZ, size = size, weights = weightsNZ, offset =
## offsetNZ, : the line search routine failed, possibly due to insufficient
## numeric precision

## Warning in nbinomGLM(x = x, Y = YNZ, size = size, weights = weightsNZ, offset =
## offsetNZ, : the line search routine failed, possibly due to insufficient
## numeric precision

## Warning in nbinomGLM(x = x, Y = YNZ, size = size, weights = weightsNZ, offset =
## offsetNZ, : the line search routine failed, possibly due to insufficient
## numeric precision
## Warning in nbinomGLM(x = x, Y = YNZ, size = size, weights = weightsNZ, offset =
## offsetNZ, : the line search routine failed, possibly due to insufficient
## numeric precision
## Warning in nbinomGLM(x = x, Y = YNZ, size = size, weights = weightsNZ, offset =
## offsetNZ, : the moving direction increases the objective function value

- Performing comparison

dir.create(path = paste(data_folder, '/RNASeq/DESeq2/DEGs_Comparisons', sep = ''), 
           showWarnings = F)
dir.create(paste(data_folder, '/RNASeq/DESeq2/DEGs_Comparisons/Control_vs_BP-CMML', sep = ''), 
           showWarnings = F)

resAsh_df <- as.data.frame(resAsh) %>% add_column(GeneSymbol =rowData(dds)[rownames(resAsh),'gene_name'], .before = 'baseMean') %>% add_column(GeneBiotype =rowData(dds)[rownames(resAsh),'gene_type'], .before = 'baseMean')

resAsh_df$Is_TF <- 'No'
resAsh_df[grep(TRUE, resAsh_df$GeneSymbol %in% TF_list$V1),'Is_TF'] <- 'Yes'

resAsh_df$Is_CD_Marker <- 'No'
resAsh_df[grep(TRUE, resAsh_df$GeneSymbol %in% HGNC_CD_markers_list$Approved.symbol),'Is_CD_Marker'] <- 'Yes'

res_significant <- resAsh_df %>% arrange(padj)
write.csv(res_significant, 
          file = paste(data_folder, '/RNASeq/DESeq2/DEGs_Comparisons/Control_vs_BP-CMML/DGE_BP-CMML_vs_Control_ALL_DEGs_LFC_Ashr.csv', sep = ''), 
          row.names = T, quote = FALSE)

res_significant <- resAsh_df %>% dplyr::filter(padj < 0.05) %>% dplyr::arrange(padj)
write.csv(res_significant, 
          file = paste(data_folder, '/RNASeq/DESeq2/DEGs_Comparisons/Control_vs_BP-CMML/DGE_BP-CMML_vs_Control_padj_significant_DEGs_LFC_Ashr.csv', sep = ''),
          row.names = T, quote = FALSE)

res_significant_UP <- res_significant %>% dplyr::filter(log2FoldChange >= 0) %>% dplyr::arrange(padj)
write.csv(res_significant_UP, 
          file = paste(data_folder, '/RNASeq/DESeq2/DEGs_Comparisons/Control_vs_BP-CMML/DGE_BP-CMML_vs_Control_upregulated_padj_significant_DEGs_LFC_Ashr.csv', sep = ''),
          row.names = T, quote = FALSE)

res_significant_DOWN <- res_significant %>% dplyr::filter(log2FoldChange < 0) %>% dplyr::arrange(padj)
write.csv(res_significant_DOWN, 
          file = paste(data_folder, '/RNASeq/DESeq2/DEGs_Comparisons/Control_vs_BP-CMML/DGE_BP-CMML_vs_Control_downregulated_padj_significant_DEGs_LFC_Ashr.csv', sep = ''),
          row.names = T, quote = FALSE)

5) Vulcano plot

DESeq2_results <- read.csv(paste(data_folder, '/RNASeq/DESeq2/DEGs_Comparisons/Control_vs_BP-CMML/DGE_BP-CMML_vs_Control_ALL_DEGs_LFC_Ashr.csv', sep = ''))
Figure 2D. Volcano plot of differentially expressed genes comparing BP- CMML to healthy age-matched control HSPCs.

Figure 2D. Volcano plot of differentially expressed genes comparing BP- CMML to healthy age-matched control HSPCs.

6) Heatmap uniquely up genes

UP_in_BP_CMML <- read.csv(paste(data_folder, '/RNASeq/DESeq2/DEGs_Comparisons/Control_vs_BP-CMML/DGE_BP-CMML_vs_Control_upregulated_padj_significant_DEGs_LFC_Ashr.csv', sep = ''), row.names = 'X')

UP_in_BP_CMML <- UP_in_BP_CMML %>% filter(GeneBiotype == 'protein_coding')

BP_CMML_genes <- (UP_in_BP_CMML %>% arrange(padj))$GeneSymbol[1:30]

# - - - - - - - - - - -

UP_in_healthy <- read.csv(paste(data_folder, '/RNASeq/DESeq2/DEGs_Comparisons/Control_vs_BP-CMML/DGE_BP-CMML_vs_Control_downregulated_padj_significant_DEGs_LFC_Ashr.csv', sep = ''), row.names = 'X')

UP_in_healthy <- UP_in_healthy %>% filter(GeneBiotype == 'protein_coding')

Healthy_genes <- ((UP_in_healthy %>% arrange(padj))$GeneSymbol[1:30])

genes_to_keep <- c(BP_CMML_genes, Healthy_genes)
Figure 2E. Heatmap showing the top 30 up- and down-regulated DEGs per condition, ranked by p.adj value.

Figure 2E. Heatmap showing the top 30 up- and down-regulated DEGs per condition, ranked by p.adj value.

7) Calculating condition transcriptional heterogeneity

dir.create(path = paste(data_folder, '/RNASeq/DESeq2/Other', sep = ''), 
           showWarnings = F)
dir.create(path = paste(data_folder, '/RNASeq/DESeq2/Other/Gene_Dispersion', sep = ''), 
           showWarnings = F)
dir.create(path = paste(data_folder, '/RNASeq/DESeq2/Other/Gene_Dispersion/Control_vs_BP-CMML', sep = ''), 
           showWarnings = F)

Control_samples <- (Samples_metadata %>% mutate(Sample.ID = rownames(Samples_metadata)) %>% filter(Condition == 'Control'))$Sample.ID

BPCMML_samples <- (Samples_metadata %>% mutate(Sample.ID = rownames(Samples_metadata)) %>% filter(Condition == 'BP-CMML'))$Sample.ID


TXI.Genes_Control <- TXI.Genes
TXI.Genes_Control$abundance <- TXI.Genes_Control$abundance[,Control_samples]
TXI.Genes_Control$counts <- TXI.Genes_Control$counts[,Control_samples]
TXI.Genes_Control$length <- TXI.Genes_Control$length[,Control_samples]

dds_control = DESeqDataSetFromTximport(txi = TXI.Genes_Control, 
                               colData = Samples_metadata[Control_samples,],
                               rowData= Genes_annotation_metadata[rownames(TXI.Genes_Control$counts),], 
                               design = ~1)
## using counts and average transcript lengths from tximport
dds_control <- estimateSizeFactors(dds_control)
## using 'avgTxLength' from assays(dds), correcting for library size
dds_control <- dds_control[unique_genes,]
dds_control <- DESeq(dds_control, fitType = "local")
## Warning in DESeq(dds_control, fitType = "local"): the design is ~ 1 (just an
## intercept). is this intended?
## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 995 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
TXI.Genes_BPCMML <- TXI.Genes
TXI.Genes_BPCMML$abundance <- TXI.Genes_BPCMML$abundance[,BPCMML_samples]
TXI.Genes_BPCMML$counts <- TXI.Genes_BPCMML$counts[,BPCMML_samples]
TXI.Genes_BPCMML$length <- TXI.Genes_BPCMML$length[,BPCMML_samples]

dds_BPCMML = DESeqDataSetFromTximport(txi = TXI.Genes_BPCMML, 
                               colData = Samples_metadata[BPCMML_samples,],
                               rowData= Genes_annotation_metadata[rownames(TXI.Genes_BPCMML$counts),], 
                               design = ~1)
## using counts and average transcript lengths from tximport
dds_BPCMML <- estimateSizeFactors(dds_BPCMML)
## using 'avgTxLength' from assays(dds), correcting for library size
dds_BPCMML <- dds_BPCMML[unique_genes,]
dds_BPCMML <- DESeq(dds_BPCMML, fitType = "local")
## Warning in DESeq(dds_BPCMML, fitType = "local"): the design is ~ 1 (just an
## intercept). is this intended?
## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 4429 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
df <- data.frame(row.names = rownames(rowData(dds)))
df$Disp_Control <- rowData(dds_control)[,'dispersion']
df$Disp_BPCMML <- rowData(dds_BPCMML)[,'dispersion']

write.csv(df, 
          file = paste(data_folder, '/RNASeq/DESeq2/Other/Gene_Dispersion/Control_vs_BP-CMML/Gene_Dispersion_per_condition.csv', sep = ''),
          row.names = T)